global EventTime event // should be "time" or "event" to control unit of analysis. "event" clusters high readings spaced less than 48 hours
global source Kyoto // should be "Kyoto" or "USGS"

set autotabgraphs on

if "$source"=="USGS" {
	use "USGS hourly Dst", clear
	gen double t = clock(string(mon) + " " +string(day) + " " + string(year) + " " + string(hr-1) + ":00", "MDYhm") // use non-UTC time so obs are perfectly spaced at 3.6 million milliseconds
}
else {
	use "World Data Center for Geomagnetism Dst.dta", clear
	gen double t = clock(date+ " " + time, "MDYhms") // use non-UTC time so obs are perfectly spaced at 3.6 million milliseconds
	drop date time
}

format t %tc
tsset t, delta(1 hour)
scalar T = (r(tmax)-r(tmin))/1000/60/60/24/365.25/10 // decades of data
gen double date = dofc(t)
format date %td
cap gen year=yofd(date)

* keep if t>=tc(01jan1964 00:00) & t<=tc(31mar2011 00:00) // approximate Riley sample

// seasonality
gen int week=wofd(dofc(t))
format week %tw
gen byte wofy=week(date)
graph bar (mean) Dst, over(wofy) name(Seasonality, replace)

if "$EventTime"=="event" {
	gen double _t = cond(Dst<=-100, t, .)
	gen int episode = 0
	replace episode = L.episode + (L.Dst>-100 & Dst<=-100) if _n>1
	collapse (first) date t year week (lastnm) end=_t (min) Dst, by (episode)
	replace episode = episode[_n-1] + (t-end[_n-1]>48*60*60*1000) if _n>1 // Tsubouchi and Omura require 48 hour gap for two events to be distinct
	collapse (first) date t year week (min) Dst, by (episode)
	drop if episode==0
}

gen cycle = sin((date-td(1jan2008))/11/365.25 * 2*_pi) // approximate 11-year sunspot cycle
gen negDst = -Dst
label var negDst "Absolute Dst"
sort negDst
gen long n = _n
qui gen double compcdf = 1-(_n-1)/_N
label var compcdf "Data"
scalar N = _N

scalar threshold = 150
sum compcdf if negDst>=threshold
scalar extremep = r(max)

extreme plot negDst, mrl(100 1000) name(MeanExcessPer${EventTime}) // mean residual life plot; for event-based analysis of Kyoto data looks linear above 150

// make fake data points for extrapolating to Carrington
qui forvalues i=`=ln(`=threshold')'(`=ln(850/`=threshold')/100')`=ln(850)+.00001' {
	set obs `=_N+1'
	replace negDst = exp(`i') in `=_N'
}
gen byte fake = _n>N
sort negDst

* excellent matches when *time* is the unit of analysis with Tsubouchi & Omura Table 1, rows 1-2
extreme gpd negDst if year<=2001 & negDst>280 & !fake, thresh(280) // Tsubouchi & Omura seem to exclude negDst=280
sum t if year<=2001
scalar p = 1-exp(-(1+(589-`e(threshold)')/exp([lnsig]_cons)*[xi]_cons)^(-1/[xi]_cons) * e(N)/((r(max)-r(min))/1000/60/60/24/365.25))
di "Per-year probality of -589: " p " Return time: " 1/p

extreme gpd negDst if year<=2003 & negDst>280 & !fake, thresh(200)
sum t if year<=2003
scalar p = 1-exp(-(1+(589-`e(threshold)')/exp([lnsig]_cons)*[xi]_cons)^(-1/[xi]_cons) * e(N)/((r(max)-r(min))/1000/60/60/24/365.25))
di "Per-year probality of -589: " p " Return time: " 1/p

* preferred
set seed 987654321
extreme gpd negDst if !fake, thresh(`=threshold')
scalar p = 1-exp(-(1+(589-`e(threshold)')/exp([lnsig]_cons)*[xi]_cons)^(-1/[xi]_cons) * e(N)/(T*10))
di "Per-year probality of -589: " p " Return time: " 1/p
di "Confidence interval for log return time:"
nlcom ln(1/(1-exp(-(1+(589-`e(threshold)')/exp([lnsig]_cons)*[xi]_cons)^(-1/[xi]_cons) * `e(N)'/(T*10))))
predictnl double gpdfit = (1+[xi]_cons*(negDst-threshold)/exp([lnsig]_cons))^(-1/[xi]_cons) * extremep if negDst>=threshold, force // cumulative CDF of GPD, scaled to total CDF of tail
label var gpdfit "Generalized Pareto fit"

extreme gpd negDst if !fake, thresh(`=threshold') sigvar(cycle) // allow lnsig to vary with sunspot cycle
scalar p = 1-exp(-(1+(589-`e(threshold)')/exp([lnsig]_cons+[lnsig]cycle)*[xi]_cons)^(-1/[xi]_cons) * e(N)/(T*10))
di "Per-year probality of -589 (solar activity peak): " p " Return time: " 1/p

preserve
keep if e(sample)
scalar extremep = compcdf
extreme plot, pp qq dens
restore

// bootstrap confidence intervals for extrapolated probabilities of extreme events
bootstrap shape=[xi]_cons scale=exp([lnsig]_cons) converged=e(converged) extremeN=e(N), seed(987654321) reps(1000) saving(DstGPDfit, replace): ///
		extreme gpd negDst if !fake, thresh(`=threshold') iter(100)
gen double gpdfit_lo= .
gen double gpdfit_hi= .
sort negDst
sum n if negDst>=threshold, meanonly
qui forvalues n=`r(min)'/`=_N' {
	scalar thisnegDst = negDst[`n']
	preserve
	use DstGPDfit, clear
	keep if converged
	gen double p = cond(1+shape/scale*(thisnegDst-threshold)>0, (1+shape/scale*(thisnegDst-threshold))^(-1/shape) * extremeN/N, 0) // GPD prob of event at least this big; extremep var supersedes extremep scalar
	centile p, centile(0 50 95)
	restore
	replace gpdfit_lo = r(c_1) in `n'
	replace gpdfit_hi = r(c_3) in `n'
	replace gpdfit = r(c_2) in `n'
}


* Riley-style power law fit -- produces a coefficient of 4.45. Riley (p. 8) says 3.2.
powerlaw negDst if !fake, thresh(120)
predictnl double lrileyfit = ln(exp(-([alpha]_cons - 1) * (ln(`e(depvar)') - ln(`e(threshold)'))) * e(N)/N), ci(lriley_lo lriley_hi) force // Scale log-log fit line to CDF of interval
qui gen double rileyfit=exp(lrileyfit)
label var rileyfit "Riley-style power law fit"
qui gen double riley_lo=exp(lriley_lo)
qui gen double riley_hi=exp(lriley_hi)

di "per event:  95% GPD CI for -850: [" %12.10f gpdfit_lo[_N] "," %12.10f gpdfit_hi[_N] "] per event. Median = " %12.10f gpdfit[_N]
di "per decade: 95% GPD CI for -850: [" 1-(1-gpdfit_lo[_N])^(N/T) "," 1-(1-gpdfit_hi[_N])^(N/T) "] per decade. Median = " 1-(1-gpdfit[_N])^(N/T)
di "per event:  95% Riley-style CI for -850: [" %12.10f riley_lo[_N] "," %12.10f riley_hi[_N] "] per event. Median = " %12.10f rileyfit[_N]
di "per decade: 95% Riley-style CI for -850: [" 1-(1-riley_lo[_N])^(N/T) "," 1-(1-riley_hi[_N])^(N/T) "] per decade. Median = " 1-(1-rileyfit[_N])^(N/T)

set scheme s1color

scatter compcdf negDst if !fake, msize(small) mcolor(black) || scatteri .1 250 (2) "10% of events above 250", mcolor(cranberry) mlabcolor(cranberry) mlabsize(medium) ///
  || scatteri .00268097 589 (12) "Quebec 1989 blackout", mcolor(cranberry) mlabcolor(cranberry) mlabsize(medium) ///
	|| if negDst>=100, xscale(log) yscale(log) ///
	xlabel(100(100)700 850 1000, labsize(medium)) xtick(100(100)1000) ylabel(.01 .1 1, labsize(medium)) ytick(.003(.001).01 .01(.01).1 .1(.1)1) ytick(.01 .1 1, grid) ///
	xtick(100(100)1000, grid) legend(off) xtitle("Absolute Dst", size(medium)) ///
	xline(850, lcolor(cranberry)) text(1 830 "Estimated" "Carrington" "level", place(sw) just(right) color(cranberry) size(medium)) xsize(6.5) ysize(4) ytitle("Probability earth's field weakens at least this much", size(medium)) legend(rows(1))
graph export "Complementary cumulative CDF of geomagnetic storm events as function of Dst, 1957-2014.png", replace

scatter compcdf negDst if !fake, msize(small) mcolor(black) || line rileyfit negDst if rileyfit<1, lwidth(medthick) lcolor(purple) || line gpdfit negDst, lwidth(medthick) lcolor(orange) ///
	|| if negDst>=100, xscale(log) yscale(log) ///
	xlabel(100(100)700 850 1000, labsize(medium)) xtick(100(100)1000) ylabel(.0001 .001 .01 .1 1, labsize(medium)) ytick(.0001(.0001).001 .001(.001).01 .01(.01).1 .1(.1)1) ///
	name(CompCDFPer${EventTime}, replace) xline(850) xline(850) xsize(6.5) ysize(4) ytitle("Probability field weakens at least this much", size(medium)) legend(rows(1)) ///
	xtick(100(100)1000, grid)  ytick(.01 .1 1, grid) xtitle(, size(medium))
graph export "Complementary cumulative CDF of geomagnetic storm events as function of Dst, 1957-2014, power law and GP fits.png", replace

gen _negDst = 1.004 * negDst
twoway rspike riley_lo riley_hi _negDst if fake, lcolor(purple) || rspike gpdfit_lo gpdfit_hi negDst if fake, lcolor(orange) || ///
	line rileyfit _negDst, lcolor(purple) || line gpdfit negDst, lcolor(orange) || ///
	scatter compcdf negDst, msize(small) mcolor(black) || ///
	if negDst>=300, xscale(log)  ///
	xlabel(300 400 500 600 700 850, labsize(medium)) ylabel(, labsize(medium)) legend(rows(1)) xtitle(, size(medium)) ///
	name(CompCDFPerEventCILinear, replace) xsize(6.5) ysize(4) legend(label(5 Data) rows(1) order(5 3 4)) xline(850) ytitle("Probability that field weakens at least this much", size(medium))
graph export "Complementary cumulative CDF of geomagnetic storm events as function of Dst, 1957-2014, power law and GP fits, with CIs.png", replace

// histogram of per-decade boostrapped probability estimates for -850-or-larger event
preserve
use DstGPDfit, clear
gen double perdecade = cond(1+shape/scale*(850-threshold)>0, 1-exp(-(1+shape/scale*(850-threshold))^(-1/shape) * extremeN/T), 0)
label var perdecade "Estimated probability/decade of -850 storm"
histogram perdecade, name(pvalues850, replace)
restore

